/*
 * RandomFactorSequenceGenerator.java
 *
 * Copyright 2003 Sergio Anibal de Carvalho Junior
 *
 * This file is part of NeoBio.
 *
 * NeoBio is free software; you can redistribute it and/or modify it under the terms of
 * the GNU General Public License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * NeoBio is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
 * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
 * PURPOSE. See the GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along with NeoBio;
 * if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
 * Boston, MA 02111-1307, USA.
 *
 * Proper attribution of the author as the source of the software would be appreciated.
 *
 * Sergio Anibal de Carvalho Junior     mailto:sergioanibaljr@users.sourceforge.net
 * Department of Computer Science       http://www.dcs.kcl.ac.uk
 * King's College London, UK            http://www.kcl.ac.uk
 *
 * Please visit http://neobio.sourceforge.net
 *
 * This project was supervised by Professor Maxime Crochemore.
 *
 */

package neobio.textui;

import java.io.BufferedWriter;
import java.io.Writer;
import java.io.FileWriter;
import java.io.OutputStreamWriter;
import java.io.IOException;

/**
 * This class is a simple command line based utility for generating random sequences with
 * optimal LZ78 factorisation.
 *
 * <P>The main method takes three parameters from the command line to generate a
 * sequence: <CODE>type</CODE>, <CODE>size</CODE> and <CODE>file</CODE>, where:
 * <UL>
 * <LI><B><CODE>type</CODE></B> is either <CODE>DNA</CODE> for DNA sequences or
 * <CODE>PROT</CODE> for protein sequences.
 * <LI><B><CODE>size</CODE></B> is the number of characters (must be greater than 3).
 * <LI><B><CODE>file</CODE></B> (optional) is the name of a file (if omitted, sequence
 * is written to standard output).
 * </UL>
 * </P>
 *
 * <P>The sequence itself is the only thing written to the chosen destination. The
 * character distribution summary, usage text and error messages are all written to
 * standard error, so that the sequence can be safely redirected or piped.</P>
 *
 * @author Sergio A. de Carvalho Jr.
 */
public class RandomFactorSequenceGenerator
{
    /**
     * Character set for DNA sequences.
     */
    private static final char[] DNA_CHARS = {'A', 'C', 'G', 'T'};

    /**
     * Character set for protein sequences.
     */
    private static final char[] PROT_CHARS = {'A','R','N','D','C','Q','E','G','H','I',
                            'L','K','M','F','P','S','T','W','Y','V','B','Z','X'};

    /**
     * The main method takes three parameters from the command line to generate a
     * sequence. See the class description for details.
     *
     * <P>This method always terminates the JVM: exit status 0 on success, 1 on invalid
     * arguments and 2 on an I/O failure.</P>
     *
     * @param args command line arguments
     */
    public static void main (String[] args)
    {
        Writer  output;
        String  seq_type, filename;
        int     size;
        char[]  charset;
        int[]   qty;

        // the first two arguments (sequence type and size) are required
        if (args.length < 2)
        {
            usage();
            System.exit(1);
            return;
        }

        seq_type = args[0];

        try
        {
            size = Integer.parseInt(args[1]);
        }
        catch (NumberFormatException e)
        {
            usage();
            System.exit(1);
            return;
        }

        // validate character set
        if (seq_type.equalsIgnoreCase("DNA"))
            charset = DNA_CHARS;
        else if (seq_type.equalsIgnoreCase("PROT"))
            charset = PROT_CHARS;
        else
        {
            // no such option
            usage();
            System.exit(1);
            return;
        }

        // validate size
        if (size <= 3)
        {
            System.err.println ("Error: size must be greater than 3.");
            System.exit(1);
            return;
        }

        if (args.length > 2)
        {
            // 3rd argument (optional): file name
            filename = args[2];

            try
            {
                // open file for writing
                output = new BufferedWriter (new FileWriter (filename));
            }
            catch (IOException e)
            {
                System.err.println ("Error: couldn't open " + filename + " for writing.");
                e.printStackTrace();
                System.exit(2);
                return;
            }
        }
        else
        {
            // file name was omitted, use standard output
            filename = null;
            output = new OutputStreamWriter (System.out);
        }

        try
        {
            try
            {
                qty = writeSequence (output, charset, size);
                output.flush();
            }
            finally
            {
                // Close only a writer opened here; standard output is left open.
                // This must happen before the handler below runs, because
                // System.exit() does not return and would skip any later cleanup.
                if (filename != null) output.close();
            }
        }
        catch (IOException e)
        {
            System.err.println ("Error: failed to write sequence.");
            e.printStackTrace();
            System.exit(2);
            return;
        }

        // print character distribution; everything goes to standard error so that
        // it never gets mixed with a sequence written to standard output
        System.err.println("\nCharacter distribution:");
        for (int i = 0; i < charset.length; i++)
            System.err.println(charset[i] + ": " + qty[i]);

        System.exit(0);
    }

    /**
     * Writes a random sequence of exactly <CODE>size</CODE> characters to the given
     * writer. The sequence is built factor by factor: each round first writes a copy of
     * the factor built so far and then extends it with one randomly chosen character.
     * The last round is truncated as soon as <CODE>size</CODE> characters have been
     * written. The writer is neither flushed nor closed by this method.
     *
     * @param output destination of the sequence
     * @param charset characters the sequence is drawn from
     * @param size number of characters to write
     * @return the number of times each character of <CODE>charset</CODE> was written,
     * indexed like <CODE>charset</CODE>
     * @throws IOException if writing to <CODE>output</CODE> fails
     */
    private static int[] writeSequence (Writer output, char[] charset, int size)
        throws IOException
    {
        // per-character statistics
        int[] qty = new int[charset.length];

        // indexes (into charset) of the characters of the growing factor
        int[] factor = new int[maxFactorLength (size)];

        int written = 0, f_size = 0, random;

        while (written < size)
        {
            // copy previous factor
            for (int i = 0; i < f_size && written < size; i++)
            {
                output.write(charset[factor[i]]);
                qty[factor[i]]++;
                written++;
            }

            if (written < size)
            {
                // choose a character index randomly
                random = (int) (Math.random() * charset.length);

                // extend factor with the random char index
                factor[f_size++] = random;
                qty[random]++;

                output.write(charset[random]);
                written++;
            }
        }

        return qty;
    }

    /**
     * Returns an upper bound on the length the growing factor can reach while a
     * sequence of <CODE>size</CODE> characters is written.
     *
     * <P>The factor receives its (k+1)-th character only after k(k+3)/2 characters have
     * been written and only if that count is still below <CODE>size</CODE>. Hence
     * k &lt; sqrt(2 * size), and the factor never holds more than
     * floor(sqrt(2 * size)) + 1 characters.</P>
     *
     * @param size number of characters of the sequence
     * @return maximum number of characters the factor may hold (always at least 1)
     */
    private static int maxFactorLength (int size)
    {
        // computed in double precision so that 2 * size cannot overflow
        return (int) Math.sqrt (2.0 * size) + 1;
    }

    /**
     * Prints command line usage.
     */
    private static void usage ()
    {
        System.err.println(
        "\nUsage: RandomFactorSequenceGenerator <type> <size> [<file>]\n\n" +
        "where:\n\n" +
        "   <type> = DNA for nucleotide sequences\n" +
        "            or PROT for protein sequences\n\n" +
        "   <size> = number os characters\n\n" +
        "   <file> = name of a file to where the sequence is to be written\n" +
        "            (if ommited, sequence is written to standard output)"
        );
    }
}